# -*- coding: utf-8 -*-
# @Time: 2022-12-21 15:23
# @Email: jianzhou.zhao@mobilemd.cn
# @Author: jianzhou.zhao
# @File: convert_to_CMeEE_format.py
# @notice:
import os
import json
from collections import Counter

from seqeval.metrics.sequence_labeling import get_entities

# Dataset identifier; it is also used as the name of the output sub-directory.
task = "emr_v3"

# Splits to convert; each one is read from `<data_path>/<mode>.txt`.
modes = ["train", "dev", "test"]

# Directory that holds the tab-separated input files.
data_path = r"/home/zhoazj/Desktop/codes/projects/emr_ie/ner/data/emr_v3.0/contrast2_2"

# Directory that receives one `<mode>.json` file per split.
save_path = r"/home/zhoazj/Desktop/codes/gitee/BERT-NER-Pytorch/datasets/{}".format(task)

# Create the output directory up front; an already existing one is not an error.
os.makedirs(save_path, exist_ok=True)

def _build_example(tokens, tags):
    """Build one CMeEE-style example from parallel token and tag lists.

    Args:
        tokens: Tokens of one sentence, in order. They are concatenated
            without a separator to form the example text.
        tags: Sequence-labelling tags, one per token, in the form accepted by
            ``seqeval``'s ``get_entities``.

    Returns:
        A dict with ``text`` (the concatenated tokens) and ``entities`` (a
        list of dicts with ``start_idx``, ``end_idx``, ``type`` and
        ``entity``). Both indices are inclusive character offsets into
        ``text``.
    """
    text = ''.join(tokens)

    # `get_entities` reports token indices. Map them to character offsets so
    # the spans stay correct even when a token is longer than one character.
    # offsets[i] is where token i starts; the last entry equals len(text).
    offsets = [0]
    for token in tokens:
        offsets.append(offsets[-1] + len(token))

    entities = []
    for entity_type, first_token, last_token in get_entities(tags):
        start_idx = offsets[first_token]
        end_idx = offsets[last_token + 1] - 1
        entities.append({
            "start_idx": start_idx,
            "end_idx": end_idx,
            "type": entity_type,
            "entity": text[start_idx: end_idx + 1],
        })
    return dict(text=text, entities=entities)


# Tokens that close a sentence; an example is emitted right after each one.
sentence_enders = ('。', '?', '!', '！', '？')

# Examples of every split, collected for the length statistics printed later.
all_examples = []
for mode in modes:
    examples = []
    chars = []
    labels = []
    with open(os.path.join(data_path, f'{mode}.txt'), encoding='utf-8') as fr:
        for line in fr:
            # First column is the token, last column is its tag.
            fields = line.rstrip().split('\t')
            char = fields[0]
            if not char:
                # Blank line (or empty first column): nothing to collect.
                continue
            chars.append(char)
            labels.append(fields[-1])
            if char in sentence_enders:
                examples.append(_build_example(chars, labels))
                chars = []
                labels = []

    # Flush whatever follows the last sentence-final mark; without this the
    # trailing tokens of a file would be silently dropped.
    if chars:
        examples.append(_build_example(chars, labels))

    with open(os.path.join(save_path, f'{mode}.json'), 'w', encoding='utf-8') as fw:
        json.dump(examples, fw, ensure_ascii=False, indent=2)
    all_examples.extend(examples)

# Histogram of example text lengths (in characters) over all converted splits.
counter = Counter(len(item['text']) for item in all_examples)
print(counter)

# Print again, one per line, every length above 256 with its example count.
for length, count in counter.items():
    if length <= 256:
        continue
    print(length, count)
